{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4b735b52",
   "metadata": {},
   "source": [
    "## 中文分词"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cac178ca",
   "metadata": {},
   "source": [
    "### jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c83f0882",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the package is installed into the\n",
    "# environment of the running kernel rather than whichever pip is first on PATH.\n",
    "%pip install jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b5db981",
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba  # jieba can also segment Traditional Chinese text\n",
    "\n",
    "content = \"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"\n",
    "\n",
    "# cut_all=False selects precise mode.\n",
    "# jieba.cut returns an iterator object, whereas jieba.lcut returns a list.\n",
    "result = jieba.lcut(content, cut_all=False)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b72a4b89",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search-engine mode: builds on the precise-mode result and additionally\n",
    "# splits long words into shorter ones.\n",
    "search_result = jieba.lcut_for_search(content)\n",
    "search_result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6759c97",
   "metadata": {},
   "source": [
    "#### 使用自定义词典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20585349",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.makedirs('data', exist_ok=True)\n",
    "user_dict = os.path.join('data', 'user_dict.txt')\n",
    "\n",
    "# One dictionary entry per line: word [frequency] [part-of-speech tag];\n",
    "# the frequency and the tag are optional.\n",
    "# Every element needs its own trailing comma: without it Python silently\n",
    "# concatenates adjacent string literals into a single list element.\n",
    "dict_content = [\n",
    "    '云计算 5 n\\n',\n",
    "    '李小福 2 nr\\n',\n",
    "    'easy_install 3 eng\\n',\n",
    "    '好用 300\\n',\n",
    "    '韩玉赏鉴 3 nz\\n',\n",
    "    '八一双鹿 3 nz\\n',\n",
    "]\n",
    "\n",
    "# Write the entries as UTF-8 text; each entry already ends with a newline.\n",
    "with open(user_dict, 'w', encoding='utf-8') as f:\n",
    "    f.writelines(dict_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a912bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Segment the sentence before load_userdict is called in this cell ...\n",
    "before_dict = jieba.lcut('八一双鹿更名为八一南昌篮球队')\n",
    "\n",
    "# ... then load the custom dictionary file and segment the same sentence again.\n",
    "jieba.load_userdict(user_dict)\n",
    "after_dict = jieba.lcut('八一双鹿更名为八一南昌篮球队')\n",
    "\n",
    "# Show both segmentations, one per line, for comparison.\n",
    "print(before_dict, after_dict, sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee60d7b8",
   "metadata": {},
   "source": [
    "### hanlp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48400183",
   "metadata": {},
   "source": [
    "HanLP 是一个支持中英文的 NLP 处理工具包，基于 TensorFlow 2.0。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17c839a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the package is installed into the\n",
    "# environment of the running kernel rather than whichever pip is first on PATH.\n",
    "%pip install hanlp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccdb4ca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import hanlp\n",
    "\n",
    "content = \"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"\n",
    "\n",
    "# Load the pretrained CTB6_CONVSEG model, used here for Chinese word segmentation.\n",
    "tokenizer = hanlp.load('CTB6_CONVSEG')\n",
    "\n",
    "result = tokenizer(content)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bec1c206",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): newer HanLP releases appear to ship the English tokenizer under\n",
    "# hanlp.utils.lang.en, while older 2.0 releases exposed it as\n",
    "# hanlp.utils.english_tokenizer. Try the newer path first and fall back to the\n",
    "# original one so the cell works with either layout.\n",
    "try:\n",
    "    from hanlp.utils.lang.en.english_tokenizer import tokenize_english\n",
    "except ImportError:\n",
    "    from hanlp.utils.english_tokenizer import tokenize_english\n",
    "\n",
    "eng_content = \"Mr. Hankcs bought hankcs.com for 1.5 thousand dollars.\"\n",
    "\n",
    "eng_result = tokenize_english(eng_content)\n",
    "eng_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fcce309e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torchX",
   "language": "python",
   "name": "torchx"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
